I. Data Proofing: Data Type Standardization

Important Data Munging Notes: - Watch out for date format when reading data in - Tag ID MUST be read in as a character value

Read in current file that contains data that has gone through initial cleaning, including…

To do + species: resolution needed for (1) inconsistent species recorded for same tag and (2) no species recorded How to resolve? Machine learning methods using R package mclust that uses covariates (weight, tail length, ear length, hind foot length, life stage, etc.) to assign species. I have done a first crack at this but need to check these methods with Justin Suraci. Potentially use his dataset as a training data set?

II. Tag ID Data Proofing

smdatN <- read_csv("/Users/tinacheng/Dropbox/SMURF_MS/Data/FERP_RodentData/DataMungingFiles/smdatN_fixed.csv",
                   col_types = cols(tag_id = col_character(),
                                    no_tag_reason = col_character()))

II. Fixing tags - Notes

  1. Fix duplicate tags
  1. FE prefix
  1. Other messy shit
#Figure out when FE0 tags started being used
smFE <- smdatN %>% 
  filter(grepl("FE", tag_id)) 
FE_startdate = min(smFE$date)
#FE_startdate
#started tagging with FE IDs in summer 2016

1. Strip down tag numbers

  • standardizes numbers to 4 digits
  • standardizes prefix to uppercase
  • replace NA with blanks
smTags1 <- smdatN %>%
  #format tag no and tag prefix, then paste together again
  mutate(tag_no = str_extract(tag_id, "(\\d)+"), #extract number from tag_id
         tag_no = str_pad(tag_no, 4, pad = "0"), #format to ####
         #create new column called tag_no_new for reformatted tags
         tag_no_new = substring(tag_no, 2, 4), #break down into just last 3 digits (removing 1000 or 2000 manually assigned)
         tag_no_new = str_pad(tag_no_new, 4, pad = "0"), #format back into ####
         #extract tag prefix
         tag_id = toupper(tag_id), #first make sure they are all uppercase
         tag_id = str_replace_all(tag_id, "NA", ""), #there were a couple "NA" characters that needed to get replaced with nothing
         tag_prefix = str_extract(tag_id, "[A-Z]+"),
         tag_id_new = ifelse(is.na(tag_prefix), tag_no_new, paste(tag_prefix, tag_no_new, sep = "")))  #extract out tag prefix
smTags1_display <- smTags1 %>% dplyr::select(quarter, year, date, day_no, x, y, animal, recap, tag_id, tag_no, tag_no_new, tag_prefix, tag_id_new)
DT::datatable(smTags1_display, filter = "top")

2. Set up logic for identifying duplicate tags and FE prefix:

Calculate metrics of distance from previous tag by date tagged, trap, and time diff from first FE tags for finding missing tag prefixes

smTags2 <- smTags1 %>% 
  ungroup() %>% 
  arrange(tag_id_new, date) %>% 
  group_by(tag_id_new) %>% 
  mutate(date = mdy(date),
         date_dist = date - lag(date), #number of days between date and previous tag date
         trap_dist = sqrt((abs(x - lag(x))^2) + (abs(y - lag(y))^2)), #distance in meters from previous tag trap
         FE_startdate = mdy(FE_startdate), #first day FE tags were used
         FE_datediff = date - FE_startdate) %>%  #number of days from when FE tags were first used
  
  #calculate date distance for just the tag number (no prefixes) to help identify missing FE prefixes in tags
  ungroup() %>% 
  arrange(tag_no_new, date) %>% 
  group_by(tag_no_new) %>% 
  mutate(date_dist_no = date - lag(date))
smTags2_display <- smTags2 %>% dplyr::select(quarter, year, date, day_no, x, y, animal, recap, tag_id_new, date_dist, trap_dist, FE_datediff)
#View(smTags2_display)
DT::datatable(smTags2_display, filter = "top")

3. Fix FE tags

Add FE prefix to stripped down tag # if…

  1. Animal is a newly tagged animal during the time in which FE tags were being used on the FERP:
  1. tag does not already have a prefix (FE or otherwise), (ii) animal is caught after FE tags started being used on the FERP, (iii) the animal does not have a tag (animal is NOT a recap)
  1. Animal is a recap but caught during a time in which FE tags were being used on the FERP:
  1. tag does not already have a prefix (FE or otherwise), (ii) animal is caught after FE tags started being used on the FERP, (ii) animal has a tag (animal IS a recap), (iv) the last time its tag was recorded is more than 1 year ago
smTags3 <- smTags2 %>% 
  #Fix tags that are likely missing FE prefix
  #assign a tag prefix "FE" if new animal tagged after or on first day FE tags used
  mutate(tag_prefix_new = ifelse(FE_datediff >= 0 & recap == "n" & !is.na(tag_id) & is.na(tag_prefix), "FE", tag_prefix), 
         #also assign tag prefix "FE" if animal is a recap, was caught after first FE tag date, and if previous tag was seen over 1 year ago
         tag_prefix_new = ifelse(!is.na(tag_prefix), tag_prefix, 
                                 ifelse(recap == "y" & FE_datediff > 0 & date_dist_no <= 365, "FE", NA))) %>% 
  ungroup() %>% 
  mutate(FEtag_change = ifelse(!is.na(tag_prefix) & tag_prefix == tag_prefix_new, "n",
                               ifelse(is.na(tag_prefix) & !is.na(tag_prefix_new), "y", NA)),
         tag_id_new = ifelse(is.na(tag_prefix_new) & is.na(tag_no_new), NA,
                             ifelse(is.na(tag_prefix_new) & !is.na(tag_no_new), tag_no_new,
                                    ifelse(!is.na(tag_prefix_new) & !is.na(tag_no_new), paste(tag_prefix_new, tag_no_new, sep = ""), NA)))) 
smTags3_display <- smTags3 %>% 
  dplyr::select(quarter, year, date, day_no, x, y, animal, recap, tag_id, tag_id_new, date_dist, trap_dist, FE_datediff, tag_prefix, tag_prefix_new, FEtag_change) 

DT::datatable(smTags3_display, filter = "top")
num_FEchange <- smTags3 %>% filter(FEtag_change == "y") %>% nrow()
num_FEchange
## [1] 6

4. Identify duplicate tags

4a. Identify new tagging events

First create a new column to identify newly caught animals based on “date distance” (# days since tag was last recorded). I did this because the recap column that should identify when new animals are caught is not reliable. Tags are recorded as “new” if…

  1. It is the earliest record of the tag (first time tag is ever recorded)

  2. The tag was recorded more than a year (date distance >= 730 days) from previous tag observation

Assumption: If an animal has not been recaught within 2 years, it is assumed dead or to have emigrated from the population. Thus, new observations of that tag are assumed to be a new animal.

ddFUN <- function(dd){
  
  smTags3 %>% 
    #Fix tags that are duplicate tags
    arrange(tag_id_new, date) %>% 
    group_by(tag_id_new) %>% 
    #create new column to identify when a new tagging event has occurred, defined by first tag event (is.na(date_dist)) and if it has been more than one year since animal was tagged compared to last tagging event 
    mutate(new_tag = ifelse(animal == "y" & is.na(date_dist), "y",
                            ifelse(animal == "y" & date_dist >= dd, "y",
                                   ifelse(animal == "y" & date_dist < dd, "n", NA))),
           dist_day = dd) %>% 
    
    #########
  ungroup() %>% 
    arrange(new_tag, tag_id_new, date) %>% #arrange data to get new tag events together
    group_by(new_tag, tag_id_new) %>% #group data 
    mutate(n = ifelse(new_tag == "y", n(), NA)) 
  
}

dd = c(365, 730, 1096)
smTags4_dd <- map_df(dd, ddFUN)
smTags4_dd %>%
  filter(!is.na(tag_id_new)) %>% 
  ggplot() +
  geom_bar(aes(x = n, fill = as.factor(dist_day)), position = "dodge", alpha = 0.4, color = "grey") +
  xlab("# times tag used") +
  scale_fill_manual(labels = c("1 yr", "2 yrs", "3 yrs"), values = c("red", "orange", "yellow")) +
  labs(fill = "date distance <= ...") +
  ggtitle("Identifying duplicate tags based on date distance")

The number of newly identified tagging events varies based on the cutoff in date distance used. In other words, if a tag is not recorded for 1 year and then recorded again, is that tag a new but duplicate tag or was the tag just not observed for 1 year? What about 2 years? 3 years? Depending on our cutoff, the number of new but duplicate tags varies. if we choose too short of a time, we are biasing ourselves towards more new duplicate tags (higher turnover of animals & higher recapture rate). But if we choose too high a value, we are biasing ourselves towards longer-lived and more sporadically caught animals (lower turnover of animals and lower recapture rate). What number to use? It’s impossible to know without more info…

I will choose the intermediary value, 2 years or 730 days.

ddc = 730 #date distance cutoff is 730, or 2 years

smTags4 <- smTags3 %>% 
  #Fix tags that are duplicate tags
  arrange(tag_id_new, date) %>% 
  group_by(tag_id_new) %>% 
  #create new column to identify when a new tagging event has occurred, defined by first tag event (is.na(date_dist)) and if it has been more than one year since animal was tagged compared to last tagging event 
  mutate(new_tag = ifelse(animal == "y" & is.na(date_dist), "y",
                          ifelse(animal == "y" & date_dist >= ddc, "y",
                                 ifelse(animal == "y" & date_dist < ddc, "n", NA))))
smTags4_display <- smTags4 %>% 
  dplyr::select(quarter, year, date, day_no, x, y, animal, recap, tag_id_new, date_dist, trap_dist, new_tag)
DT::datatable(smTags4_display, filter = "top")

4b. Identify duplicate tags

Duplicate tags are created using this protocol:

  1. Create a new column (n) that calculates the number of new tag events for each unique tag ID (e.g. if the same unique tag ID was recorded as being a new tag twice (based on the “new tag” column created in 4a), n = 2)

  2. Create a new column (tag_group) that creates a group number for each new tag event for that tag ID

  3. Fix duplicated tags by adding 1000 or 2000 to duplicated tags:

  • If n = 1, tag does not get changed

  • If n = 2, for the first tag (tag_group == 1), 1000 is added to the tag (e.g. if the original tag is 0001, the tag is now 1001); for the second tag (tag_group == 2) remains the same

  • If n = 3, for the first tag (tag_group == 1), 1000 is added to the tag (0001 -> 1001); for the second tag (tag_group == 2), 2000 is added (0001 -> 2001); for the third tag, there is no change.

Tags were not used more than 3x so there is no n = 4.

smTags5 <- smTags4 %>% 
  #make groups within tag; e.g. group 1 for first time tag used, group 2 for second time tag used, etc.
  ungroup() %>% 
  arrange(new_tag, tag_id_new, date) %>% #arrange data to get new tag events together
  group_by(new_tag, tag_id_new) %>% #group data 
  mutate(n = ifelse(new_tag == "y", n(), NA), #number of new tag events for each tag_id
         tag_group = ifelse(new_tag == "y", seq_along(1:n), NA)) %>% #number tag events for new tags
  ungroup() %>% 
  arrange(tag_id_new, date) %>% #re-arrange data to get tag id's together ordered by date
  fill(tag_group) %>% #fill in missing tag groups according to group id assigned to each new tag event
  
  #make duplicate tags unique by adding 1000's to old tags
  mutate(tag_no_prefix = ifelse(new_tag == "y" & n == 1, 0,
                                ifelse(new_tag == "y" & n == 2 & tag_group == 1, 1000,
                                       ifelse(new_tag == "y" & n == 2 & tag_group == 2, 0,
                                              ifelse(new_tag == "y" & n == 3 & tag_group == 1, 1000,
                                                     ifelse(new_tag == "y" & n == 3 & tag_group == 2, 2000,
                                                            ifelse(new_tag == "y" & n == 3 & tag_group == 3, 0, NA)))))),
         tag_no_new = as.numeric(tag_no_new),
         tag_no_prefix = as.numeric(tag_no_prefix)) %>% 
  fill(tag_no_prefix) %>% 
  mutate(tag_id_new = ifelse(!is.na(tag_prefix), tag_id_new, tag_no_prefix + tag_no_new),
         tag_id_new = ifelse(!is.na(tag_prefix), tag_id_new, str_pad(tag_id_new, 4, pad = "0")), #format numbers to 4 digits
         tag_id_change = ifelse(tag_no != tag_id_new, "y", "n")) #add column to indicate where tag ID has been identified as a duplicate tag and changed

III. To Do:

1. Misread/Mistranscribed tags

There are still tag ID’s that are incorrect from being misread or mistranscribed. It is too time consuming to proof every single tag. Thus, we will choose a random subset of ID’s and calculate a rate at which tags were misread/mistranscribed.

smTags6 <- smTags5 %>% 
  arrange(tag_id_new, date) %>% 
  #recalculate date_dist and trap_dist
  group_by(tag_id_new) %>% 
  mutate(date_dist = date - lag(date), #number of days between date and previous tag date
         trap_dist = sqrt((abs(x - lag(x))^2) + (abs(y - lag(y))^2))) #distance in meters from previous tag trap) 

#write_csv(smTags6, "/Users/tinacheng/Dropbox/SMURF_MS/Data/FERP_RodentData/DataMungingFiles/SMURF_Master_Tags.csv")
smTags6 %>%
  filter(species %in% c("californicus", "truei")) %>% 
  filter(date_dist < 400) %>% 
  ggplot() +
  geom_histogram(aes(x = trap_dist, fill = species), alpha = 0.4) +
  xlab("Distance (m): trap at timepoint (t-1) - trap at timepoint (t)") +
  ylab("# of trapping events")

p1 <- smTags6 %>%
  filter(species %in% c("californicus", "truei")) %>% 
  #filter(date_dist < 400) %>% 
  ggplot() +
  geom_vline(xintercept = 730, color = "grey") +
  geom_histogram(aes(x = date_dist, fill = species), alpha = 0.4) +
  xlab("Distance (m): trap at timepoint (t-1) - trap at timepoint (t)") +
  ylab("# of trapping events")

p2 <- smTags6 %>%
  filter(species %in% c("californicus", "truei")) %>% 
  filter(date_dist <= 730) %>% 
  ggplot() +
  geom_histogram(aes(x = date_dist, fill = species), alpha = 0.4) +
  xlab("Distance (m): trap at timepoint (t-1) - trap at timepoint (t)") +
  ylab("# of trapping events")

grid.arrange(p1, p2, ncol = 1)

Pick out random tag IDs to proof

First set of 100 tags to proof

utags <- unique(smTags6$tag_id_new)
tags2check <- sample(utags, size = 100, replace = F)

tags2check
##   [1] "0290"   "FE0039" "0537"   "2311"   "0975"   "0204"   "FE0099"
##   [8] "FE0126" "1978"   "FE0430" "0885"   "FT0470" "FT0340" "1079"  
##  [15] "FE0354" "0615"   "0683"   "0966"   "0565"   "0426"   "0647"  
##  [22] "FE0030" "2398"   "0098"   "FE0990" "1008"   "FE0160" "1191"  
##  [29] "2380"   "0331"   "0039"   "0629"   "0597"   "FE0176" "0937"  
##  [36] "0701"   "1303"   "FE0331" "0830"   "0194"   "1266"   "0844"  
##  [43] "0448"   "FE0128" "0953"   "FE0303" "FT0112" "0318"   "0123"  
##  [50] "FE0377" "0167"   "FE0317" "FE0045" "0601"   "0370"   "FE0383"
##  [57] "FE0302" "FE0389" "0986"   "0170"   "0902"   "FE0388" "0489"  
##  [64] "2386"   "0308"   "FE0456" "FE0103" "0810"   "0155"   "0969"  
##  [71] "0568"   "FE0170" "0596"   "0386"   "0272"   "1249"   "0965"  
##  [78] "1179"   "FE0143" "FE0097" "FE0017" "FE0247" "FE0290" "0109"  
##  [85] "0086"   "0982"   "0954"   "1022"   "FE0078" "0878"   "0252"  
##  [92] "FE0080" "FE0196" "0906"   "1402"   "1398"   "FE0426" "0265"  
##  [99] "0299"   "1327"

2. FE tags

Check FE tags where date distance < 365 where animals are recap to see if FE prefix is missing (# tagging events to check: 53)

smTags3_2check <- smTags3 %>%
  filter(FE_datediff >= 0) %>% 
  filter(animal == "y") %>% 
  filter(recap == "y") %>% 
  filter(is.na(tag_prefix_new)) 

#write_csv(smTags3_2check, "/Users/tinacheng/Dropbox/SMURF_MS/Data/FERP_RodentData/DataProofing/FEtags_2check.csv")

smTags3_2checkdis <-  smTags3_2check %>% dplyr::select(quarter, year, date, day_no, x, y, animal, recap, tag_id, tag_id_new, date_dist, trap_dist, FE_datediff, tag_prefix, tag_prefix_new, FEtag_change) 

DT::datatable(smTags3_2checkdis, filter = "top")

3. Check tag ID’s not recorded but where an animal was caught

Table 1. Summary of tagging events by species that have NO tag recorded. These events are often due to an escaped animal or in the case of non-Peromyscus species, the animal was not tagged.

#animals caught but with no tag read or animal not tagged

notag <- smTags6 %>% 
  filter(animal == "y") %>% 
  filter(is.na(tag_id_new)) %>% 
  mutate(Species = ifelse(!is.na(genus) & is.na(species), paste(genus, "sp."),
                          ifelse(is.na(genus), "*no sp recorded", 
                                 ifelse(genus == "Sorex", "Sorex sp.",
                                        ifelse(genus == "Neotoma", "Neotoma fuscipes",
                                               ifelse(genus == "Microtus", "Microtus californiucs",
                                                      ifelse(genus == "Reithrotodontomys", "Reithrodontomys megalotis",
                                                             ifelse(genus == "Peromyscus", paste(genus, species), "*no sp recorded")))))))) %>% 
  group_by(Species) %>% 
  summarise(`# tagging events w/ no tag recorded` = n())

kable(notag, format = "markdown", align = c('c'))
Species # tagging events w/ no tag recorded
*no sp recorded 116
Microtus californiucs 4
Neotoma fuscipes 30
Peromyscus californicus 43
Peromyscus sp. 9
Peromyscus truei 78
Sorex sp. 181
noPE <- smTags6 %>%
  filter(genus == "Peromyscus") %>% 
  nrow()

Do missing tags matter?

  • Total # Peromyscus tagging events: 3048
  • Peromyscus with no tag recorded: 130/3048 (4%)

  • Identified reasons out of 130: 117

  • Peromyscus not tagged b/c they escaped: 43/3048 (2%)
  • Peromyscus not tagged b/c ears torn (aka previously tagged but tag(s) lost): 38/3048 (1%)
  • Peromyscus not tagged b/c they were dead: 14/3048 (<1%)
  • Peromyscus not tagged b/c they were too small: 13/3048 (<1%)
notag_QY <- smTags6 %>% 
  filter(animal == "y") %>% 
  filter(is.na(tag_id_new)) %>% 
  mutate(Species = ifelse(!is.na(genus) & is.na(species), paste(genus, "sp."),
                          ifelse(is.na(genus), "*no sp recorded", 
                                 ifelse(genus == "Sorex", "Sorex sp.",
                                        ifelse(genus == "Neotoma", "Neotoma fuscipes",
                                               ifelse(genus == "Microtus", "Microtus californiucs",
                                                      ifelse(genus == "Reithrotodontomys", "Reithrodontomys megalotis",
                                                             ifelse(genus == "Peromyscus", paste(genus, species), "*no sp recorded")))))))) %>% 
  group_by(Species, quarter, year) %>% 
  summarise(n = n()) %>% 
  spread(key = Species, value = n) %>% 
  arrange(year)
#write_csv(notag_QY, "/Users/tinacheng/Dropbox/SMURF_MS/Figures/PrelimTables/notags.csv")

table(smTags6$no_tag_reason)